#Prerequisites. Task1: Install and import the tidyverse package.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Task2: Creating tibbles with as_tibble().
as_tibble(iris)
Task3: Creating a new tibble from individual vectors with tibble().
# Build a tibble from individual vectors; later columns may refer to
# earlier ones, so z is computed from x and y.
tibble(
  x = 1:5,
  y = 1,          # length-1 values are recycled to the tibble's length
  z = x^2 + y
)
Task4:
# Non-syntactic column names (symbols, spaces, leading digits) are
# allowed in tibbles when wrapped in backticks.
tb <- tibble(
  `:)`   = "smile",
  ` `    = "space",
  `2000` = "number"
)
tb
Task5:
# tribble() lays a small tibble out row by row; the ~x headers name
# the columns, and the data follow in row order.
tribble(
  ~x,  ~y, ~z,
  "a",  2, 3.6,
  "b",  1, 8.5
)
Task6:
# A 1000-row tibble mixing column types (date-time, date, integer,
# double, character) to show how tibbles print only what fits on screen.
tibble(
a = lubridate::now() + runif(1e3) * 86400, # random times within the next day (86400 s)
b = lubridate::today() + runif(1e3) * 30,  # random dates within the next 30 days
c = 1:1e3,
d = runif(1e3),
e = sample(letters, 1e3, replace = TRUE)
)
Task7:
# Print only the first 10 rows of flights, but all columns (width = Inf).
nycflights13::flights %>%
  print(n = 10, width = Inf)
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15
## 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29
## 3 33 AA 1141 N619AA JFK MIA 160 1089 5 40
## 4 -18 B6 725 N804JB JFK BQN 183 1576 5 45
## 5 -25 DL 461 N668DN LGA ATL 116 762 6 0
## 6 12 UA 1696 N39463 EWR ORD 150 719 5 58
## 7 19 B6 507 N516JB EWR FLL 158 1065 6 0
## 8 -14 EV 5708 N829AS LGA IAD 53 229 6 0
## 9 -8 B6 79 N593JB JFK MCO 140 944 6 0
## 10 8 AA 301 N3ALAA LGA ORD 138 733 6 0
## time_hour
## <dttm>
## 1 2013-01-01 05:00:00
## 2 2013-01-01 05:00:00
## 3 2013-01-01 05:00:00
## 4 2013-01-01 05:00:00
## 5 2013-01-01 06:00:00
## 6 2013-01-01 05:00:00
## 7 2013-01-01 06:00:00
## 8 2013-01-01 06:00:00
## 9 2013-01-01 06:00:00
## 10 2013-01-01 06:00:00
## # ℹ 336,766 more rows
Task8:
nycflights13::flights %>%
View()
#Subsetting. Task9:
# A small tibble of random numbers used to demonstrate column extraction.
# (The values below depend on the RNG state, so your output will differ.)
df <- tibble(
x = runif(5),
y = rnorm(5)
)
# Extract by name: $ and [[ both return the underlying vector.
df$x
## [1] 0.1336431 0.2271860 0.1601787 0.6879047 0.3632293
df[["x"]]
## [1] 0.1336431 0.2271860 0.1601787 0.6879047 0.3632293
# Extract by position: [[1]] is the first column.
df[[1]]
## [1] 0.1336431 0.2271860 0.1601787 0.6879047 0.3632293
Task10: You need the special placeholder `.` to use these extractions in a pipe.
# Inside a magrittr pipe, `.` stands for the incoming data frame,
# so .$x and .[["x"]] extract a column mid-pipe.
df %>% .$x
## [1] 0.1336431 0.2271860 0.1601787 0.6879047 0.3632293
df %>% .[["x"]]
## [1] 0.1336431 0.2271860 0.1601787 0.6879047 0.3632293
Task11:
class(as.data.frame(tb))
## [1] "data.frame"
#Exercise.
as_tibble(mtcars)
# Base data.frames do partial matching with $: df$x silently matches the
# "xyz" column. A tibble would return NULL with a warning instead.
df <- data.frame(abc = 1, xyz = "a")
df$x
## [1] "a"
df[, "xyz"]
## [1] "a"
df[, c("abc", "xyz")]
# Extracting a column whose name is stored in a variable (var <- "mpg"):
as_tibble(mtcars)
var <- "mpg"
#column <- tibble %>%
#  {.[[var]]}
Task12:
heights <- read_csv("heights.csv")
## Rows: 1192 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): sex, race
## dbl (4): earn, height, ed, age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Task13:
read_csv("a,b,c
1,2,3
4,5,6")
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): a, b, c
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Task14:
read_csv("The first line of metadata
The second line of metadata
x,y,z
1,2,3", skip = 2)
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Task15:
read_csv("# A comment I want to skip
x,y,z
1,2,3", comment = "#")
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Task16:
read_csv("1,2,3\n4,5,6", col_names = FALSE)
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): X1, X2, X3
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Task17:
read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Task18:
read_csv("a,b,c\n1,2,.", na = ".")
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): a, b
## lgl (1): c
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Parsing a vector. Task19:
# The parse_* functions turn character vectors into typed vectors.
str(parse_logical(c("TRUE", "FALSE", "NA")))
## logi [1:3] TRUE FALSE NA
str(parse_integer(c("1", "2", "3")))
## int [1:3] 1 2 3
str(parse_date(c("2010-01-01", "1979-10-14")))
## Date[1:2], format: "2010-01-01" "1979-10-14"
Task20:
parse_integer(c("1", "231", ".", "456"), na=".")
## [1] 1 231 NA 456
#If parsing fails, you’ll get a warning: Task21:
# Inputs that cannot be parsed trigger a warning and are recorded in the
# "problems" attribute of the result.
x <- parse_integer(c("123", "345", "abc", "123.45"))
## Warning: 2 parsing failures.
## row col expected actual
## 3 -- no trailing characters abc
## 4 -- no trailing characters 123.45
# The failures will be missing (NA) in the output: Task22:
x
## [1] 123 345 NA NA
## attr(,"problems")
## # A tibble: 2 × 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA no trailing characters abc
## 2 4 NA no trailing characters 123.45
Task23:
problems(x)
#Numbers Task24:
parse_double("1.23")
## [1] 1.23
parse_double("1,23", locale = locale(decimal_mark = "," ))
## [1] 1.23
Task24:
parse_number("$100")
## [1] 100
parse_number("20%")
## [1] 20
parse_number("It cost $123.45")
## [1] 123.45
Task25:
#Used in America
parse_number("$123,456,789")
## [1] 123456789
#Used in many parts of Europe.
parse_number("123.456.789", locale = locale(grouping_mark = "."))
## [1] 123456789
#Used in Switzerland.
parse_number("123'456'789", locale = locale(grouping_mark = "'"))
## [1] 123456789
#Strings Task26:
charToRaw("Hadley")
## [1] 48 61 64 6c 65 79
Task27:
# Two strings whose bytes are not valid UTF-8: x1 is Latin-1 encoded,
# x2 is Shift-JIS encoded; printed raw they look garbled.
x1 <- "El Ni\xf10 was particularly bad this year"
x2 <- "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd"
x1
## [1] "El Ni\xf10 was particularly bad this year"
x2
## [1] "\x82\xb1\x82\xf1\x82ɂ\xbf\x82\xcd"
Task28:
parse_character(x1, locale = locale(encoding = "Latin1"))
## [1] "El Niñ0 was particularly bad this year"
parse_character(x2, locale = locale(encoding = "Shift-JIS"))
## [1] "こんにちは"
Task29:
guess_encoding(charToRaw(x1))
guess_encoding(charToRaw(x2))
Task30:
# parse_factor() warns when a value is outside the supplied level set,
# turns it into NA, and records it in the "problems" attribute.
fruit <- c("apple", "banana")
parse_factor(c("apple", "banana", "bananana"), levels = fruit)
## Warning: 1 parsing failure.
## row col expected actual
## 3 -- value in level set bananana
## [1] apple banana <NA>
## attr(,"problems")
## # A tibble: 1 × 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA value in level set bananana
## Levels: apple banana
#Dates, date-times, and times Task31:
parse_datetime("2010-10-01T2010")
## [1] "2010-10-01 20:10:00 UTC"
parse_datetime("20101010")
## [1] "2010-10-10 UTC"
Task32:
parse_date("2010-10-01")
## [1] "2010-10-01"
Task33:
library(readr)
library(hms)
##
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
##
## hms
Task34:
parse_time("01:10am")
## 01:10:00
parse_time("20:10:01")
## 20:10:01
#Non-digits Task34:
parse_date("01/02/15", "%m/%d/%y")
## [1] "2015-01-02"
parse_date("01/02/15", "%d/%m/%y")
## [1] "2015-02-01"
parse_date("01/02/15", "%y/%m/%d")
## [1] "2001-02-15"
Task35:
parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr"))
## [1] "2015-01-01"
#Parsing a file. # Strategy Task36:
# readr guesses a column type by trying parsers in order; guess_parser()
# shows which parser wins for a given character vector.
guess_parser("2010-10-01")
## [1] "date"
guess_parser("15:01")
## [1] "time"
guess_parser(c("TRUE", "FALSE"))
## [1] "logical"
guess_parser(c("1","5","9"))
## [1] "double"
guess_parser(c("12,352,561"))
## [1] "number"
str(parse_guess("2010-10-10"))
## Date[1:1], format: "2010-10-10"
#Problems Task37:
challenge <- read_csv(readr_example("challenge.csv"))
## Rows: 2000 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): x
## date (1): y
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Task38:
problems(challenge)
Task39:
tail(challenge)
Task40-
# First attempt: declaring y as logical fails because later rows of the
# file contain dates, producing parsing problems.
challenge <- read_csv(
readr_example("challenge.csv"),
col_types = cols(
x = col_double(),
y = col_logical()
)
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
# Fix: declare y as a date column instead; tail() confirms the last rows
# now parse correctly.
challenge <- read_csv(
readr_example("challenge.csv"),
col_types = cols(
x = col_double(),
y = col_date()
)
)
tail(challenge)
#Other strategies Task41-
challenge2 <- read_csv(readr_example("challenge.csv"), guess_max = 1001)
## Rows: 2000 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): x
## date (1): y
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Task42-
challenge2
Task43-
challenge2 <- read_csv(readr_example("challenge.csv"),
col_types = cols(.default = col_character())
)
Task44-
# Character columns that actually hold numbers; type_convert() below
# re-parses them into doubles.
df <- tribble(
  ~x,  ~y,
  "1", "1.21",
  "2", "2.32",
  "3", "4.56"
)
df
Task45-
type_convert(df)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## x = col_double(),
## y = col_double()
## )
#Writing to a file. Task46-
write_csv(challenge, "challenge.csv")
Task47-
challenge
Task48-
write_csv(challenge, "challenge-2.csv")
read_csv("challenge-2.csv")
## Rows: 2000 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): x
## date (1): y
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Task49-
write_rds(challenge, "challenge.rds")
read_rds("challenge.rds")
Task50-
#install.packages("feather")
library(feather)
write_feather(challenge, "challenge.feather")
read_feather("challenge.feather")
#Tidy data. Task51-
table1
table2
table3
Task52-
# Spread across two tibbles
table4a # cases
table4b # population
Task53:
# Compute rate per 10,000
table1 %>%
mutate(rate = cases / population * 10000)
Task54:
# Compute cases per year
table1 %>%
count(year, wt = cases)
Task55:
# Visualise changes over time
library(ggplot2)
ggplot(table1, aes(year, cases)) +
geom_line(aes(group = country), colour = "grey50") +
geom_point(aes(colour = country))
Task56:
table4a
Task57:
table4a %>%
pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
Task58:
# Tidy each of the two tables into long form, then join them.
tidy4a <- table4a %>%
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
tidy4b <- table4b %>%
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "population")
# Name the join keys explicitly rather than relying on the implicit
# natural join, which prints a "Joining with ..." message and would
# silently change if the inputs gained a shared column.
left_join(tidy4a, tidy4b, by = c("country", "year"))
Task59:
table2
Task60:
table2 %>%
pivot_wider(names_from = type, values_from = count)
#Separating and uniting. #Separate Task61:
table3
Task62:
table3 %>%
separate(rate, into = c("cases", "population"))
Task63:
table3 %>%
separate(rate, into = c("cases", "population"), sep = "/")
Task64:
table3 %>%
separate(rate, into = c("cases", "population"), convert = TRUE)
Task65:
table3 %>%
separate(year, into = c("century", "year"), sep = 2)
#Unite Task66:
table5 %>%
unite(new, century, year)
Task67:
table5 %>%
unite(new, century, year, sep = "")
#Missing values Task68:
# Quarterly returns with one explicitly missing value (2015 Q4 is NA)
# and one implicitly missing observation (there is no 2016 Q1 row).
stocks <- tibble(
year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
qtr = c( 1, 2, 3, 4, 2, 3, 4),
return = c(1.88, 0.59, 0.35, NA, 0.92, 0.17, 2.66)
)
Task69:
stocks %>%
pivot_wider(names_from = year, values_from = return)
Task70:
stocks %>%
pivot_wider(names_from = year, values_from = return) %>%
pivot_longer(
cols = c(`2015`, `2016`),
names_to = "year",
values_to = "return",
values_drop_na = TRUE
)
Task71:
stocks %>%
complete(year, qtr)
Task72:
# A treatment log where repeated person names were left as NA;
# fill() below carries the last observed name forward.
treatment <- tribble(
  ~person,            ~treatment, ~response,
  "Derrick Whitmore", 1,          7,
  NA,                 2,          10,
  NA,                 3,          9,
  "Katherine Burke",  1,          4
)
Task73:
treatment %>%
fill(person)
#Case Study
who
Task74:
# Gather the wide new_*/newrel_* count columns into key/value pairs;
# values_drop_na = TRUE removes the many NA cells (absent reports).
who1 <- who %>%
pivot_longer(
cols = new_sp_m014:newrel_f65,
names_to = "key",
values_to = "cases",
values_drop_na = TRUE
)
who1
Task75:
who1 %>%
count(key)
Task76:
who2 <- who1 %>%
mutate(key = stringr::str_replace(key, "newrel", "new_rel"))
who2
Task77:
who3 <- who2 %>%
separate(key, c("new", "type", "sexage"), sep = "_")
who3
Task78:
who3 %>%
count(new)
Task79:
who4 <- who3 %>%
select(-new, -iso2, -iso3)
Task80:
who5 <- who4 %>%
separate(sexage, c("sex", "age"), sep = 1)
who5
Task81:
# The full case-study pipeline in one chain: reshape to long form,
# normalise the inconsistent "newrel" names, split the compound key,
# drop redundant columns, and split sex/age.
who %>%
pivot_longer(
cols = new_sp_m014:newrel_f65,
names_to = "key",
values_to = "cases",
values_drop_na = TRUE
) %>%
mutate(
key = stringr::str_replace(key, "newrel", "new_rel")
) %>%
# NOTE(review): the step-by-step version earlier names this column
# "type"; here it is "var" — confirm which name downstream code expects.
separate(key, c("new", "var", "sexage")) %>%
select(-new, -iso2, -iso3) %>%
separate(sexage, c("sex", "age"), sep = 1)